# root folder that holds the per-wave data directories read below
# (bhps_w<N>/ and ukhls_w<N>/); the value looks like a placeholder to be
# replaced with a local path before running
# NOTE(review): fread(), glue(), rbindlist(), setnames() etc. are used in this
# script without a visible library() call - presumably data.table and glue are
# attached by whatever sources this file; confirm
wd <- "USOC-DATA-FOLDER"


# 1. build individual file ----

# file/variable prefixes, indexed by overall wave number:
#   1-18  -> "ba_" ... "br_"  (read from the bhps_w* folders)
#   19-28 -> "a_"  ... "j_"   (read from the ukhls_w* folders)
#   29    -> "jk_"
# NOTE(review): the last prefix is "jk_" rather than "k_" - confirm it matches
# the file names in ukhls_w11
wave_list <- c(
  paste0("b", letters[1:18], "_"),
  paste0(letters[1:10], "_"),
  "jk_"
)


# variables to grab from each wave's data
# pidp excluded b/c that has same name across all waves

# un-prefixed names of the individual-level variables to keep from each wave
# (no trailing comma after the last element: c() rejects an empty argument)
varlist_ind <- c(
  "hidp",
  "age_dv",
  "pno",
  "hgr2r",
  "hrpno"
)

# VARIABLE INFO:
# pidp: cross-wave person ID
# hidp: id of household this wave (for linking to other hh members and hh
#     data)
# age_dv: age at time of interview
# pno: person number
# hgr2r: relationship to hh reference person (BHPS only)
# hrpno: person number of hh reference person (needed b/c hgr2r doesn't appear
#      to exist for USoc)

# function for loading and processing waves of individual data

# Load one wave of individual-response data and harmonise its column names.
#
# Reads the wave's indresp.tab file from under `wd`, keeps pidp plus whichever
# of the varlist_ind variables exist in that wave, strips the wave prefix from
# their names and tags every row with the wave index.
#
# wave:    position in wave_list (a single whole number); values below 19 are
#          read from the bhps_w* folders, the rest from ukhls_w* with the
#          folder number restarting at 1
# returns: a data.table whose columns are the available varlist_ind variables
#          (in varlist_ind order), then pidp, then wave
process_person <- function(wave) {
  # fail early on an index with no entry in wave_list; otherwise the NA prefix
  # only surfaces later as a confusing file-not-found error
  stopifnot(length(wave) == 1L, wave %in% seq_along(wave_list))

  # the letters used for file- and variable pre-fixes
  wave_letters <- wave_list[wave]

  # folder for this wave: numbering restarts at 1 after the first 18 waves
  if (wave < 19) {
    wave_dir <- glue("bhps_w{wave}")
  } else {
    ukhls_wave <- wave - 18
    wave_dir <- glue("ukhls_w{ukhls_wave}")
  }

  # load data
  dt <- fread(glue("{wd}/{wave_dir}/{wave_letters}indresp.tab"))

  # names of the requested variables in this wave, built by adding the prefix
  # (paste0 is vectorised and always returns a plain character vector)
  var_list_temp <- paste0(wave_letters, varlist_ind)

  # which of the requested variables actually exist in this wave's file
  found <- var_list_temp %in% names(dt)

  # drop any column that we haven't selected (pidp carries no prefix)
  drop_list <- setdiff(names(dt), c(var_list_temp, "pidp"))
  if (length(drop_list) > 0) {
    dt[, (drop_list) := NULL]
  }

  # put the selected variables first, in varlist_ind order; pidp follows
  setcolorder(dt, var_list_temp[found])

  # rename to cross-wave names, pairing old and new by name so the result
  # does not depend on the current column positions
  setnames(dt, old = var_list_temp[found], new = varlist_ind[found])

  # add the wave index; set() keeps the function argument and the new column
  # name from being confused inside a data.table expression
  set(dt, j = "wave", value = wave)

  dt
}


# run all the waves through the processing code and stack

# stack every wave; fill = TRUE pads variables missing from a wave with NA
people <- rbindlist(
  lapply(seq_along(wave_list), process_person),
  fill = TRUE
)


# create "head" flags

# first for BHPS (waves 1-18): flag rows whose hgr2r equals 1. Rows from later
# waves evaluate to FALSE here (FALSE & NA is FALSE) and are overwritten below

people[, head := (wave < 19 & hgr2r == 1)]

# now for USoc (waves 19+): the head is the person whose pno equals hrpno

people[wave > 18, head := (pno == hrpno)]




# 2. build household file ----

# fihhmngrs_dv: gross monthly income month before interview
# tenure_dv: housing tenure (own no mrgt, own mrgt, rent public, rent private)
# hhdenub_xw: household cross sectional weight 


# un-prefixed names of the household-level variables to keep from each wave's
# hhresp file
varlist_hh <- c("hidp", "fihhmngrs_dv", "tenure_dv", "hhdenub_xw")


# function for loading and processing waves of household data

# Load one wave of household-response data and harmonise its column names.
#
# Reads the wave's hhresp.tab file from under `wd`, keeps whichever of the
# varlist_hh variables exist in that wave, strips the wave prefix from their
# names and tags every row with the wave index.
#
# wave:    position in wave_list (a single whole number); values below 19 are
#          read from the bhps_w* folders, the rest from ukhls_w* with the
#          folder number restarting at 1
# returns: a data.table whose columns are the available varlist_hh variables
#          (in varlist_hh order) followed by wave
process_hh <- function(wave) {
  # fail early on an index with no entry in wave_list; otherwise the NA prefix
  # only surfaces later as a confusing file-not-found error
  stopifnot(length(wave) == 1L, wave %in% seq_along(wave_list))

  # the letters used for file- and variable pre-fixes
  wave_letters <- wave_list[wave]

  # wave numbering restarts with UKHLS
  if (wave < 19) {
    wave_dir <- glue("bhps_w{wave}")
  } else {
    ukhls_wave <- wave - 18
    wave_dir <- glue("ukhls_w{ukhls_wave}")
  }

  # load data
  dt <- fread(glue("{wd}/{wave_dir}/{wave_letters}hhresp.tab"))

  # names of the requested variables in this wave, built by adding the prefix
  # (paste0 is vectorised and always returns a plain character vector)
  var_list_temp <- paste0(wave_letters, varlist_hh)

  # which of the requested variables actually exist in this wave's file
  found <- var_list_temp %in% names(dt)

  # drop any column that we haven't selected
  drop_list <- setdiff(names(dt), var_list_temp)
  if (length(drop_list) > 0) {
    dt[, (drop_list) := NULL]
  }

  # put the selected variables in varlist_hh order
  setcolorder(dt, var_list_temp[found])

  # rename to cross-wave names, pairing old and new by name so the result
  # does not depend on the current column positions
  setnames(dt, old = var_list_temp[found], new = varlist_hh[found])

  # add the wave index; set() keeps the function argument and the new column
  # name from being confused inside a data.table expression
  set(dt, j = "wave", value = wave)

  dt
}

# run all the waves through the processing code and stack

# stack every wave; fill = TRUE pads variables missing from a wave with NA
households <- rbindlist(
  lapply(seq_along(wave_list), process_hh),
  fill = TRUE
)


#############################################################
#
# 3. create household-level variables from individual data - e.g.,
#    employment status of male head/spouse, earnings of female
#    head/spouse, etc.
#
#############################################################


### first, create a temp dt, so don't create all these vars in the
### individual file

d_i_temp_dt <- copy(people)

### head age: age_dv for the household head and 0 for everyone else
### (head is logical, so multiplying zeroes out non-heads)

d_i_temp_dt[, head_age := age_dv * head]

### collapse to HH level: one row per household-wave, using people over 15

keep_rows <- d_i_temp_dt[["age_dv"]] > 15

# NOTE(review): finnow is not one of the variables listed in varlist_ind, so
# filtering on it unconditionally fails with "object 'finnow' not found".
# The filter is applied only when the column has been loaded; add "finnow" to
# varlist_ind if rows without it are meant to be excluded.
if ("finnow" %in% names(d_i_temp_dt)) {
  keep_rows <- keep_rows & !is.na(d_i_temp_dt[["finnow"]])
} else {
  warning(
    "column 'finnow' is not in the individual data; ",
    "skipping the !is.na(finnow) filter",
    call. = FALSE
  )
}

# which() drops NA comparisons, the same way data.table treats NA in i
keep_idx <- which(keep_rows)

# j has to be a list for a grouped summary; a bare `name = value` argument is
# not accepted by [.data.table
d_i_col_dt <- d_i_temp_dt[
  keep_idx,
  .(head_age = max(head_age, na.rm = TRUE)),
  by = .(hidp, wave)
]

## now merge on hh variables (inner join)

d_i_hh_dt <- merge(households, d_i_col_dt, by = c("hidp", "wave"))

## save the household-level file
# NOTE(review): DATADIR is not defined anywhere in this script (only wd is) -
# presumably it is set by the caller and ends in a path separator; confirm
saveRDS(d_i_hh_dt, paste0(DATADIR, "usoc_hh_with_ind_vars.RDS"))
